Read in PEPPRMINT estimates

Read in neoantigen estimation by NetMHC

netmhc_all  = readRDS("../data/neoAg_netMHCpan4_1.rds")

colnames(netmhc_all)
##  [1] "Pos"          "ID"           "HLA"          "Peptide_mut"  "EL_score_mut"
##  [6] "EL_rank_mut"  "Peptide_ref"  "EL_score_ref" "EL_rank_ref"  "sample"
dim(netmhc_all)
## [1] 1617468      10
head(netmhc_all)
netmhc_all  = as.data.table(netmhc_all)

Taking maximum for each somatic mutation using mutated petides

netmhc_mut = netmhc_all[,.(EL_score_mut_max = max(EL_score_mut)), 
                        by = .(ID, sample)]
dim(netmhc_mut)
## [1] 31294     3
netmhc_mut
netmhc_mut$key = gsub("_", ":", netmhc_mut$ID)
netmhc_mut$sample = gsub("_hlai", "", netmhc_mut$sample)

g1 = ggplot(netmhc_mut, aes(x=EL_score_mut_max)) + xlab("NetMHC score (mut)") + 
  geom_histogram(color="darkblue", fill="lightblue")
g1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Taking maximum for each somatic mutation using reference peptides

netmhc_ref = netmhc_all[,.(EL_score_ref_max = max(EL_score_ref)), 
                        by = .(ID, sample)]
dim(netmhc_ref)
## [1] 31294     3
netmhc_ref
netmhc_ref$key = gsub("_", ":", netmhc_ref$ID)
netmhc_ref$sample = gsub("_hlai", "", netmhc_ref$sample)

g1 = ggplot(netmhc_ref, aes(x=EL_score_ref_max)) + xlab("NetMHC score (ref)") + 
  geom_histogram(color="darkblue", fill="lightblue")
g1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compare NetMHC estimates between reference and mutation

table(netmhc_mut$ID == netmhc_ref$ID)
## 
##  TRUE 
## 31294
table(netmhc_mut$sample == netmhc_ref$sample)
## 
##  TRUE 
## 31294
netmhc = merge(netmhc_mut, netmhc_ref)
netmhc
gg1 = ggplot(data = netmhc, mapping = aes(x = EL_score_ref_max, 
                                          y = EL_score_mut_max)) +
  geom_pointdensity(size=1) + xlab("NetMHCpan-4.1 ref") + 
  ylab("NetMHCpan-4.1 mut") + 
  scale_color_viridis() + theme(legend.position = "top")

pdf("step4_nb_analysis_files/netmhc_ref_vs_mut.pdf", width=3.5, height=4.2)
gg1
dev.off()
## quartz_off_screen 
##                 2

Read in neoantigen estimation by PEPPRMINT

Results after taking maximum for the scores from mutated peptides

ppmint_mut = fread("../output_PEPPRMINT/PEPPRMINT_Riaz_2017_aggregate.tsv")
dim(ppmint_mut)
## [1] 31298     3
ppmint_mut
summary(ppmint_mut$pepprmint)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000057 0.2962712 0.5442800 0.5168409 0.7257650 0.9994867
setequal(ppmint_mut$sample, netmhc$sample)
## [1] TRUE
length(setdiff(ppmint_mut$key, netmhc$key))
## [1] 1136
g1 = ggplot(ppmint_mut, aes(x=pepprmint)) + xlab("PEPPRMINT score (mut)") + 
  geom_histogram(color="darkblue", fill="lightblue")
g1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Results after taking maximum for the scores from reference peptides

ppmint_ref = fread("../output_PEPPRMINT/PEPPRMINT_Riaz_2017_ref_aggregate.tsv")
dim(ppmint_ref)
## [1] 31298     3
ppmint_ref
summary(ppmint_ref$pepprmint)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000077 0.2605880 0.4955733 0.4915124 0.7253583 0.9992200
table(ppmint_mut$key == ppmint_ref$key)
## 
##  TRUE 
## 31298
table(ppmint_mut$sample == ppmint_ref$sample)
## 
##  TRUE 
## 31298
g1 = ggplot(ppmint_ref, aes(x=pepprmint)) + xlab("PEPPRMINT score (ref)") + 
  geom_histogram(color="darkblue", fill="lightblue")
g1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Compare PEPPRMINT estimates between reference and mutation

ppmint = merge(ppmint_mut, ppmint_ref, by = c("key", "sample"), 
               suffixes = c("_mut", "_ref"))
ppmint
gg2 = ggplot(data = ppmint, mapping = aes(x = pepprmint_ref, y = pepprmint_mut)) +
  geom_pointdensity(size=1) + xlab("PEPPRMINT ref") + ylab("PEPPRMINT mut") + 
  scale_color_viridis() + theme(legend.position = "top")

pdf("step4_nb_analysis_files/pepprmint_ref_vs_mut.pdf", width=3.5, height=4.2)
gg2
dev.off()
## quartz_off_screen 
##                 2

Read in patient information

sample_mb = fread(file = "../data/riaz_patient_mb_info.txt")
dim(sample_mb)
## [1] 104   3
sample_mb[1:2,]
ppmint$Patient = sub("_.*", "", ppmint$sample)
netmhc$Patient = sub("_.*", "", netmhc$sample)
sample_mb$Patient = sub("_.*", "", sample_mb$sample)

clinic = read_excel("../data/_supp/mmc2.xlsx", skip=1)
dim(clinic)
## [1] 73 12
clinic[1:2,]
clinic = as.data.frame(clinic)
table(clinic$Patient %in% sample_mb$Patient)
## 
## FALSE  TRUE 
##     8    65
table(tolower(clinic$Patient) %in% tolower(sample_mb$Patient))
## 
## FALSE  TRUE 
##     8    65
patient = intersect(clinic$Patient, sample_mb$Patient)
length(patient)
## [1] 65
patient = intersect(patient, ppmint$Patient)
length(patient)
## [1] 60
clinic[which(! clinic$Patient %in% patient),]
clinic = clinic[match(patient, clinic$Patient),]
dim(clinic)
## [1] 60 12
head(clinic)
names(clinic)[c(4:5,7)] = c("Dead", "Time", "Mutation")
head(clinic)

Prepare clinical information

table(clinic$Response)
## 
## CR NE PD PR SD 
##  3  3 23 10 21
clinic[["Response3"]] = clinic$Response
clinic$Response3[which(clinic$Response == "NE")] = "PD"
clinic$Response3[which(clinic$Response == "CR")] = "PRCR"
clinic$Response3[which(clinic$Response == "PR")] = "PRCR"

clinic$Response = clinic$Response3
clinic$Response[which(clinic$Response3 == "SD")] = "PDSD"
clinic$Response[which(clinic$Response3 == "PD")] = "PDSD"

table(clinic$Response, clinic$Response3)
##       
##        PD PRCR SD
##   PDSD 26    0 21
##   PRCR  0   13  0
clinic$Cohort[which(clinic$Cohort == "NIV3-NAIVE")] = "Ipi naive"
clinic$Cohort[which(clinic$Cohort == "NIV3-PROG")]  = "Ipi prog"

Get mutation burden information

mat_pre = match(paste0(patient, "_pre"), sample_mb$sample)
clinic[["mb_pre"]] = log10(sample_mb$n_mutations_neoAg[mat_pre] + 1)

mat_on = match(paste0(patient, "_on"), sample_mb$sample)
clinic[["mb_on"]]  = log10(sample_mb$n_mutations_neoAg[mat_on] + 1)

Check the relation between mutation burden and clinical response

clinic$mut_diff = clinic$mb_pre - clinic$mb_on
anova(lm(mut_diff ~ Response, data = clinic))
anova(lm(mb_pre ~ Response, data = clinic))
p0 = ggplot(clinic, aes(x=Response, y=mb_pre)) + 
  geom_boxplot(outlier.shape = NA) + 
  geom_jitter(position=position_jitter(0.2), size=1) + 
  labs(title="", x="Response", y = "log10(MB)") 
p0

p0 = ggplot(clinic, aes(x=Response, y=mb_pre - mb_on)) + 
  geom_boxplot(outlier.shape = NA) + 
  geom_jitter(position=position_jitter(0.2), size=1) + 
  labs(title="", x="Response", y = "log10(MB) decrease") 
p0
## Warning: Removed 23 rows containing non-finite values (stat_boxplot).
## Warning: Removed 23 rows containing missing values (geom_point).

res_cox = coxph(Surv(Time, Dead) ~ mb_pre, data = clinic)
res_cox
## Call:
## coxph(formula = Surv(Time, Dead) ~ mb_pre, data = clinic)
## 
##           coef exp(coef) se(coef)     z     p
## mb_pre 0.04681   1.04793  0.21031 0.223 0.824
## 
## Likelihood ratio test=0.05  on 1 df, p=0.8232
## n= 60, number of events= 35

Estimate neoAg burden

Assess association between neoAg versus cytolytic_score

cutoffs = seq(0.75, 0.95, by=0.05)
pvals   = matrix(NA, nrow=length(cutoffs), ncol=2)

netmhc$sample = as.factor(netmhc$sample)
ppmint$sample = as.factor(ppmint$sample)

clinic$`Cytolytic Score` = as.numeric(clinic$`Cytolytic Score`)
## Warning: NAs introduced by coercion
summary(clinic$cytolytic_score)
## Length  Class   Mode 
##      0   NULL   NULL
g1 = ggplot(clinic, aes(x=log10(`Cytolytic Score`))) + 
  xlab("log10(Cytolytic Score)") + 
  geom_histogram(color="darkblue", fill="lightblue")
g1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 18 rows containing non-finite values (stat_bin).

clinic[["cytolytic_score"]] = log10(clinic$`Cytolytic Score`)
  
for(i in 1:length(cutoffs)){
  ci = cutoffs[i]
  
  tti       = table(netmhc$sample[netmhc$EL_score_mut_max >= ci])
  mat_pre   = match(paste0(patient, "_pre"), names(tti))
  nb_netmhc_pre = log10(tti[mat_pre] + 1)
  pvals[i,1] = anova(lm(clinic$cytolytic_score ~ nb_netmhc_pre))$`Pr(>F)`[1]

  tti       = table(ppmint$sample[ppmint$pepprmint_mut >= ci])
  mat_pre   = match(paste0(patient, "_pre"), names(tti))
  nb_ppmint_pre = log10(tti[mat_pre] + 1)
  pvals[i,2] = anova(lm(clinic$cytolytic_score ~ nb_ppmint_pre))$`Pr(>F)`[1]
}
pvals
##           [,1]       [,2]
## [1,] 0.1785801 0.08309863
## [2,] 0.1951604 0.04960438
## [3,] 0.2340431 0.03979126
## [4,] 0.2656822 0.03564467
## [5,] 0.1552959 0.07081369
pval_df = data.frame(pval = c(pvals), 
                     cutoff = rep(as.character(cutoffs), times=2),
                     method = rep(c("NetMHCpan-4.1", "PEPPRMINT"), each=5))

pm = ggplot(pval_df, aes(x = method, y = -log10(pval), fill = cutoff)) + 
  geom_bar( stat="identity", position=position_dodge()) + 
  scale_fill_brewer(palette="Paired") + theme(legend.position = "top") +  
  geom_hline(yintercept = -log10(0.05), lty=2) + 
  guides(fill=guide_legend(nrow=2, byrow=TRUE))

pdf("step4_nb_analysis_files/nb_vs_cytolytic_pvals.pdf", width=2.8, height=4)
print(pm)
dev.off()
## quartz_off_screen 
##                 2
anova(lm(clinic$cytolytic_score ~ clinic$mb_pre))

Illustrate the relation between neoAg versus cytolytic_score

Here we use 0.9 as cutoff of neoantigen score.

ci = 0.9

tti       = table(netmhc$sample[netmhc$EL_score_mut_max >= ci])
mat_pre   = match(paste0(patient, "_pre"), names(tti))
nb_netmhc_pre = log10(tti[mat_pre] + 1)

mat_on    = match(paste0(patient, "_on"), names(tti))
nb_netmhc_on  = log10(tti[mat_on] + 1)


tti       = table(ppmint$sample[ppmint$pepprmint_mut >= ci])
mat_pre   = match(paste0(patient, "_pre"), names(tti))
nb_ppmint_pre = log10(tti[mat_pre] + 1)

mat_on    = match(paste0(patient, "_on"), names(tti))
nb_ppmint_on  = log10(tti[mat_on] + 1)

clinic[["nb_netmhc_pre"]] = nb_netmhc_pre
clinic[["nb_ppmint_pre"]] = nb_ppmint_pre

clinic2 = clinic[which(!is.na(clinic$`Cytolytic Score`)),]

lm1  = lm(cytolytic_score ~ mb_pre, data=clinic2)
lm1
## 
## Call:
## lm(formula = cytolytic_score ~ mb_pre, data = clinic2)
## 
## Coefficients:
## (Intercept)       mb_pre  
##       2.049        0.157
pval = anova(lm1)$`Pr(>F)`[1]
g1 = ggplot(clinic2, aes(x=mb_pre, y=cytolytic_score)) +
  geom_point() + xlab("mutation burden\n") + geom_smooth(method=lm) + 
  ggtitle(sprintf("pvalue = %.3f", pval))

pval = anova(lm(cytolytic_score ~ nb_netmhc_pre, data=clinic2))$`Pr(>F)`[1]
g2 = ggplot(clinic2, aes(x=nb_netmhc_pre, y=cytolytic_score)) + 
  geom_point() + xlab("neoantigen burden \n(NetMHCpan 4.1)") + 
  geom_smooth(method=lm) + 
  ggtitle(sprintf("pvalue = %.3f", pval))

pval = anova(lm(cytolytic_score ~ nb_ppmint_pre, data=clinic2))$`Pr(>F)`[1]
g3 = ggplot(clinic2, aes(x=nb_ppmint_pre, y=cytolytic_score)) +
  geom_point() + xlab("neoantigen burden \n(PEPPRMINT)") + 
  geom_smooth(method=lm) + 
  ggtitle(sprintf("pvalue = %.3f", pval))

gg1 = ggarrange(g1, g2, g3, ncol = 3, 
          common.legend = TRUE, legend = "bottom")
## `geom_smooth()` using formula 'y ~ x'
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'
## Don't know how to automatically pick scale for object of type table. Defaulting to continuous.
## `geom_smooth()` using formula 'y ~ x'
gg1

pdf("step4_nb_analysis_files/nb_vs_cytolytic.pdf", width=8, height=3)
print(gg1)
dev.off()
## quartz_off_screen 
##                 2

Check the relation between neoantigen change and treatment response

ci = 0.9

tti       = table(netmhc$sample[netmhc$EL_score_mut_max >= ci])
mat_pre   = match(paste0(patient, "_pre"), names(tti))
nb_netmhc_pre = log10(as.numeric(tti[mat_pre]) + 1)

mat_on    = match(paste0(patient, "_on"), names(tti))
nb_netmhc_on  = log10(as.numeric(tti[mat_on]) + 1)

tti       = table(ppmint$sample[ppmint$pepprmint_mut >= ci])
mat_pre   = match(paste0(patient, "_pre"), names(tti))
nb_ppmint_pre = log10(as.numeric(tti[mat_pre]) + 1)

mat_on    = match(paste0(patient, "_on"), names(tti))
nb_ppmint_on  = log10(as.numeric(tti[mat_on]) + 1)

clinic[["nb_netmhc_pre"]] = nb_netmhc_pre
clinic[["nb_ppmint_pre"]] = nb_ppmint_pre

clinic[["nb_netmhc_on"]] = nb_netmhc_on
clinic[["nb_ppmint_on"]] = nb_ppmint_on

clinic$mut_diff = clinic$mb_pre - clinic$mb_on
anova(lm(mut_diff ~ Response, data = clinic))
clinic$nb_netmhc_diff = clinic$nb_netmhc_pre - clinic$nb_netmhc_on
anova(lm(nb_netmhc_diff ~ Response, data = clinic))
clinic$nb_ppmint_diff = clinic$nb_ppmint_pre - clinic$nb_ppmint_on
anova(lm(nb_ppmint_diff ~ Response, data = clinic))
table(clinic$Response, is.na(clinic$mut_diff))
##       
##        FALSE TRUE
##   PDSD    29   18
##   PRCR     8    5
table(clinic$Response, is.na(clinic$nb_netmhc_diff))
##       
##        FALSE TRUE
##   PDSD    27   20
##   PRCR     3   10
table(clinic$Response, is.na(clinic$nb_ppmint_diff))
##       
##        FALSE TRUE
##   PDSD    27   20
##   PRCR     3   10
clinic[clinic$Response == "PRCR" & !is.na(clinic$nb_ppmint_diff),]
clinic$Response = factor(clinic$Response, levels = c("PDSD", "PRCR"))

g0 = ggplot(clinic, aes(x = nb_netmhc_pre, y = nb_netmhc_on, color=Response)) +
    geom_jitter(width = 0.02, height=0.02, size = 1) + 
  xlab("pre-therapy") + ylab("on-therapy") + 
  ggtitle("NetMHCpan-4.1 \n neoantigen burden") + xlim(-0.02, 2.2)

g1 = ggplot(clinic, aes(x = nb_ppmint_pre, y = nb_ppmint_on, color=Response)) +
    geom_jitter(width = 0.02, height=0.02, size = 1) + 
  xlab("pre-therapy") + ylab("on-therapy") + 
  ggtitle("PEPPRMINT \n neoantigen burden") + xlim(-0.02, 2.2)

gg2 = ggarrange(g0, g1, ncol = 2, common.legend = TRUE, legend = "bottom", 
                labels = c("(A)", "(B)"))
## Warning: Removed 30 rows containing missing values (geom_point).

## Warning: Removed 30 rows containing missing values (geom_point).

## Warning: Removed 30 rows containing missing values (geom_point).
gg2

pdf("step4_nb_analysis_files/compare_nb_pre_on.pdf", width=5.6, height=3)
print(gg2)
dev.off()
## quartz_off_screen 
##                 2

Select potential neoantigens

cols = c("sample", "key", "EL_score_mut_max", "EL_score_ref_max", "Patient")
netmhc = netmhc[, ..cols]
median(netmhc$EL_score_ref_max)
## [1] 0.2206
table(netmhc$EL_score_mut_max > 0.8 & netmhc$EL_score_ref_max < 0.22)
## 
## FALSE  TRUE 
## 30981   313
table(netmhc$EL_score_mut_max > 0.8 & netmhc$EL_score_ref_max < 0.2)
## 
## FALSE  TRUE 
## 31006   288
table(netmhc$EL_score_ref_max > 0.8 & netmhc$EL_score_mut_max < 0.2)
## 
## FALSE  TRUE 
## 31115   179
table(netmhc$EL_score_mut_max > 0.9 & netmhc$EL_score_ref_max < 0.22)
## 
## FALSE  TRUE 
## 31153   141
table(netmhc$EL_score_mut_max > 0.9 & netmhc$EL_score_ref_max < 0.1)
## 
## FALSE  TRUE 
## 31223    71
table(netmhc$EL_score_ref_max > 0.9 & netmhc$EL_score_mut_max < 0.1)
## 
## FALSE  TRUE 
## 31251    43
cols = c("sample", "key", "pepprmint_mut", "pepprmint_ref", "Patient")
ppmint = ppmint[, ..cols]

median(ppmint$pepprmint_ref)
## [1] 0.4955733
table(ppmint$pepprmint_mut > 0.8 & ppmint$pepprmint_ref < 0.5)
## 
## FALSE  TRUE 
## 30830   468
table(ppmint$pepprmint_mut > 0.8 & ppmint$pepprmint_ref < 0.2)
## 
## FALSE  TRUE 
## 31206    92
table(ppmint$pepprmint_ref > 0.8 & ppmint$pepprmint_mut < 0.2)
## 
## FALSE  TRUE 
## 31271    27
table(ppmint$pepprmint_mut > 0.9 & ppmint$pepprmint_ref < 0.5)
## 
## FALSE  TRUE 
## 31150   148
table(ppmint$pepprmint_mut > 0.9 & ppmint$pepprmint_ref < 0.1)
## 
## FALSE  TRUE 
## 31290     8
table(ppmint$pepprmint_ref > 0.9 & ppmint$pepprmint_mut < 0.1)
## 
## FALSE  TRUE 
## 31297     1
neoAg_netMHC = netmhc[EL_score_mut_max > 0.9 & EL_score_ref_max < 0.22]
neoAg_ppmint = ppmint[pepprmint_mut > 0.9 & ppmint$pepprmint_ref < 0.5]

dim(neoAg_netMHC)
## [1] 141   5
dim(neoAg_ppmint)
## [1] 148   5
neoAg_netMHC[1:2,]
neoAg_ppmint[1:2,]
t1 = table(neoAg_netMHC$key)
table(t1)
## t1
##  1  2 
## 91 25
t2 = table(neoAg_ppmint$key)
table(t2)
## t2
##   1   2 
## 106  21
df_both = merge(neoAg_netMHC, neoAg_ppmint, by=c("sample", "key"))
dim(df_both)
## [1] 34  8
df_both[1:2,]
t1 = table(neoAg_netMHC$sample)
t2 = table(neoAg_ppmint$sample)
table(t1)
## t1
##  0  1  2  3  4  5  6 35 
## 38 26 12  5  2  3  3  1
table(t2)
## t2
##  0  1  2  3  4  5  7 10 54 
## 49 17  8  9  3  1  1  1  1
sort(t1, decreasing = TRUE)[1:10]
## 
##  Pt54_pre   Pt47_on  Pt47_pre  Pt65_pre   Pt7_pre  Pt72_pre  Pt87_pre  Pt44_pre 
##        35         6         6         6         5         5         5         4 
##  Pt94_pre Pt106_pre 
##         4         3
sort(t2, decreasing = TRUE)[1:10]
## 
##  Pt54_pre   Pt7_pre Pt106_pre  Pt49_pre   Pt23_on   Pt4_pre  Pt65_pre  Pt23_pre 
##        54        10         7         5         4         4         4         3 
##   Pt60_on  Pt60_pre 
##         3         3
t1p = t1[t1 > 0]
t2p = t2[t2 > 0]
length(t1p)
## [1] 52
length(t2p)
## [1] 41
length(intersect(names(t1p), names(t2p)))
## [1] 37
t1 = table(neoAg_netMHC$Patient)
t2 = table(neoAg_ppmint$Patient)
table(t1)
## t1
##  1  2  3  4  5  6 12 35 
## 16  6  3  6  3  3  1  1
table(t2)
## t2
##  1  2  3  4  5  6  7 10 54 
##  8  7  3  4  1  3  2  1  1
sort(t1, decreasing = TRUE)[1:10]
## 
## Pt54 Pt47 Pt65  Pt8 Pt92  Pt7 Pt72 Pt87 Pt13  Pt3 
##   35   12    6    6    6    5    5    5    4    4
sort(t2, decreasing = TRUE)[1:10]
## 
##  Pt54   Pt7 Pt106  Pt23  Pt60   Pt8  Pt86  Pt49   Pt4  Pt47 
##    54    10     7     7     6     6     6     5     4     4
length(intersect(names(t1), names(t2)))
## [1] 28
table(clinic$Patient %in% names(t1), clinic$Response)
##        
##         PDSD PRCR
##   FALSE   20    1
##   TRUE    27   12
table(clinic$Patient %in% names(t2), clinic$Response)
##        
##         PDSD PRCR
##   FALSE   27    3
##   TRUE    20   10
fisher.test(clinic$Patient %in% names(t1), clinic$Response)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  clinic$Patient %in% names(t1) and clinic$Response
## p-value = 0.02282
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##    1.107098 397.824196
## sample estimates:
## odds ratio 
##   8.644851
fisher.test(clinic$Patient %in% names(t2), clinic$Response)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  clinic$Patient %in% names(t2) and clinic$Response
## p-value = 0.05747
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##   0.9634899 28.0179208
## sample estimates:
## odds ratio 
##   4.389661

Add mutation annotation data

load("../data/riaz_mutdata.RData")
dim(riaz_mutdata)
## [1] 31303   104
names(riaz_mutdata)
##   [1] "id"                                      
##   [2] "QSS"                                     
##   [3] "TQSS"                                    
##   [4] "NT"                                      
##   [5] "QSS_NT"                                  
##   [6] "TQSS_NT"                                 
##   [7] "SGT"                                     
##   [8] "SOMATIC"                                 
##   [9] "seqnames"                                
##  [10] "start"                                   
##  [11] "end"                                     
##  [12] "REF"                                     
##  [13] "ALT"                                     
##  [14] "nAltTumor"                               
##  [15] "nRefTumor"                               
##  [16] "nAltNormal"                              
##  [17] "nRefNormal"                              
##  [18] "rdNormalFiltered"                        
##  [19] "rdTumorFiltered"                         
##  [20] "type"                                    
##  [21] "vafTumor"                                
##  [22] "vafNormal"                               
##  [23] "contig"                                  
##  [24] "position"                                
##  [25] "context"                                 
##  [26] "ref_allele"                              
##  [27] "alt_allele"                              
##  [28] "tumor_name"                              
##  [29] "normal_name"                             
##  [30] "score"                                   
##  [31] "dbsnp_site"                              
##  [32] "covered"                                 
##  [33] "power"                                   
##  [34] "tumor_power"                             
##  [35] "normal_power"                            
##  [36] "normal_power_nsp"                        
##  [37] "normal_power_wsp"                        
##  [38] "total_reads"                             
##  [39] "map_Q0_reads"                            
##  [40] "init_t_lod"                              
##  [41] "t_lod_fstar"                             
##  [42] "t_lod_fstar_forward"                     
##  [43] "t_lod_fstar_reverse"                     
##  [44] "tumor_f"                                 
##  [45] "contaminant_fraction"                    
##  [46] "contaminant_lod"                         
##  [47] "t_q20_count"                             
##  [48] "t_ref_count"                             
##  [49] "t_alt_count"                             
##  [50] "t_ref_sum"                               
##  [51] "t_alt_sum"                               
##  [52] "t_ref_max_mapq"                          
##  [53] "t_alt_max_mapq"                          
##  [54] "t_ins_count"                             
##  [55] "t_del_count"                             
##  [56] "normal_best_gt"                          
##  [57] "init_n_lod"                              
##  [58] "normal_f"                                
##  [59] "n_q20_count"                             
##  [60] "n_ref_count"                             
##  [61] "n_alt_count"                             
##  [62] "n_ref_sum"                               
##  [63] "n_alt_sum"                               
##  [64] "power_to_detect_positive_strand_artifact"
##  [65] "power_to_detect_negative_strand_artifact"
##  [66] "strand_bias_counts"                      
##  [67] "tumor_alt_fpir_median"                   
##  [68] "tumor_alt_fpir_mad"                      
##  [69] "tumor_alt_rpir_median"                   
##  [70] "tumor_alt_rpir_mad"                      
##  [71] "observed_in_normals_count"               
##  [72] "failure_reasons"                         
##  [73] "judgement"                               
##  [74] "Func.ensGene"                            
##  [75] "Gene.ensGene"                            
##  [76] "GeneDetail.ensGene"                      
##  [77] "ExonicFunc.ensGene"                      
##  [78] "AAChange.ensGene"                        
##  [79] "SIFT_score"                              
##  [80] "SIFT_converted_rankscore"                
##  [81] "SIFT_pred"                               
##  [82] "MutationTaster_pred"                     
##  [83] "MutationAssessor_pred"                   
##  [84] "FATHMM_pred"                             
##  [85] "PROVEAN_pred"                            
##  [86] "MetaSVM_pred"                            
##  [87] "MetaLR_pred"                             
##  [88] "GTEx_V6_gene"                            
##  [89] "GTEx_V6_tissue"                          
##  [90] "ExAC_nontcga_ALL"                        
##  [91] "avsnp150"                                
##  [92] "CLINSIG"                                 
##  [93] "CLNDBN"                                  
##  [94] "CLNACC"                                  
##  [95] "CLNDSDB"                                 
##  [96] "CLNDSDBID"                               
##  [97] "nonSynonymous"                           
##  [98] "W"                                       
##  [99] "pos"                                     
## [100] "M"                                       
## [101] "Wseq"                                    
## [102] "seqpos"                                  
## [103] "Mseq"                                    
## [104] "EnsembleID"
riaz_mutdata = riaz_mutdata[,c(1, 9:13, 74:75, 77:78)]
riaz_mutdata$key = paste(gsub("chr", "", riaz_mutdata$seqnames), 
                         riaz_mutdata$start, sep=":")
riaz_mutdata$key = paste(riaz_mutdata$key, 
                         riaz_mutdata$REF, sep=":")
riaz_mutdata$key = paste(riaz_mutdata$key, 
                         riaz_mutdata$ALT, sep=":")
riaz_mutdata[1:2,]
table(neoAg_ppmint$key %in% riaz_mutdata$key)
## 
## TRUE 
##  148
mat1 = match(neoAg_netMHC$key, riaz_mutdata$key)
neoAg_netMHC_with_anno = cbind(neoAg_netMHC, riaz_mutdata[mat1,-(1:6)])

mat2 = match(neoAg_ppmint$key, riaz_mutdata$key)
neoAg_ppmint_with_anno = cbind(neoAg_ppmint, riaz_mutdata[mat2,-(1:6)])

t1 = table(neoAg_netMHC_with_anno$Gene.ensGene)
table(t1)
## t1
##  1  2  3 
## 81 24  2
sort(t1, decreasing = TRUE)[1:10]
## 
##   SEZ6L     TTN   AGBL1  ATP2B2     C8A    CD33 COL12A1  COL6A2   DNAH2   EXOC6 
##       3       3       2       2       2       2       2       2       2       2
t2 = table(neoAg_ppmint_with_anno$Gene.ensGene)
table(t2)
## t2
##  1  2 
## 96 26
sort(t2, decreasing = TRUE)[1:10]
## 
##                AGBL1 AP000275.65;C21orf59                 CD33 
##                    2                    2                    2 
##              COL12A1               COL6A2               COL6A5 
##                    2                    2                    2 
##              FAM135B                 FGD2                 GLRB 
##                    2                    2                    2 
##                MACC1 
##                    2
fwrite(neoAg_netMHC_with_anno, 
       file="step4_nb_analysis_files/neoAg_netMHC_with_anno.tsv", sep = "\t")

fwrite(neoAg_ppmint_with_anno, 
       file="step4_nb_analysis_files/neoAg_ppmint_with_anno.tsv", sep = "\t")

SessionInfo

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.5.1        viridisLite_0.3.0    dplyr_1.0.2         
##  [4] survminer_0.4.8      ggpubr_0.4.0         survival_3.2-7      
##  [7] readxl_1.3.1         ggpointdensity_0.1.0 ggplot2_3.3.3       
## [10] data.table_1.13.6   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.5         lattice_0.20-41    tidyr_1.1.2        zoo_1.8-8         
##  [5] digest_0.6.27      R6_2.5.0           cellranger_1.1.0   backports_1.2.1   
##  [9] evaluate_0.14      pillar_1.4.7       rlang_0.4.10       curl_4.3          
## [13] car_3.0-10         Matrix_1.3-0       rmarkdown_2.6      labeling_0.4.2    
## [17] splines_4.0.3      stringr_1.4.0      foreign_0.8-81     munsell_0.5.0     
## [21] broom_0.7.3        compiler_4.0.3     xfun_0.19          pkgconfig_2.0.3   
## [25] mgcv_1.8-33        htmltools_0.5.0    tidyselect_1.1.0   tibble_3.0.4      
## [29] gridExtra_2.3      km.ci_0.5-2        rio_0.5.16         crayon_1.3.4      
## [33] withr_2.3.0        grid_4.0.3         nlme_3.1-151       jsonlite_1.7.2    
## [37] xtable_1.8-4       gtable_0.3.0       lifecycle_0.2.0    magrittr_2.0.1    
## [41] KMsurv_0.1-5       scales_1.1.1       zip_2.1.1          stringi_1.5.3     
## [45] carData_3.0-4      farver_2.0.3       ggsignif_0.6.0     ellipsis_0.3.1    
## [49] survMisc_0.5.5     generics_0.1.0     vctrs_0.3.6        cowplot_1.1.1     
## [53] openxlsx_4.2.3     RColorBrewer_1.1-2 tools_4.0.3        forcats_0.5.0     
## [57] glue_1.4.2         purrr_0.3.4        hms_0.5.3          abind_1.4-5       
## [61] yaml_2.2.1         colorspace_2.0-0   rstatix_0.6.0      knitr_1.30        
## [65] haven_2.3.1